********************************************************************************
*DYNAMIC IMPACTS OF PRICING GROUNDWATER
*Bruno, Jessoe, Hanemann in JAERE
*********************************************************************************
*WELL_LEVEL CLEAN
*Starts with dataset that is output from Extraction_preprocessing.do
*Merges and cleans data for analysis
********************************************************************************
clear all
capture log close
set more off

*SELECT OUTPUT DATE
global outputdate = "20230627"		 

*SET DIRECTORY
cd  "D:\Ellen\Dropbox\Pajaro_AgInnovation" 

*DATASET FROM "Extraction_preprocessing.do":
use "Data\well_extraction_20230615.dta", clear 

*MERGE OTHER VARIABLES
merge m:1 year quarter inside using "Data\Price_data_20210527.dta"
drop _merge
merge m:1 year quarter inside using "Data\Delivered_water_aggregate_20210601.dta"
drop _merge
merge m:1 inside year quarter using "Data\Pajaro_zonal_chloride_20210601.dta"
drop _merge
merge m:1 parcelnum year quarter using "Data\Pajaro_chloride_20210601.dta"
drop _merge
merge m:1 parcelnum year using "Data\parcel_landuse_watermerge_LH.dta"
drop _merge
merge m:1 year quarter using "Data\electricity_rates_20210601.dta"
drop _merge

*LABELING
label variable extraction "Extraction (AF)"
label variable inside "Inside DWZ dummy"
label variable price "Pumping price"
label variable delivered "Recycled deliveries"
label variable delivered_rate "Delivered rate"
gen delivered_water = delivered/1000
label variable delivered_water "Delivered Water/1000"
label variable gw_depth "Depth to water table"
label variable CL_zonedate "Chloride"
label variable rent_irrigated "Cropland Rental Rate"
label variable time_of_use "Electricity (time of use) rate"
label variable flat_rate_1B "Electricity (flat) rate"

*CLEANING
preserve
keep if inside ==1
keep if year>2004
keep if year<2017
keep if quarter ==1
bys year: sum extraction, detail
restore


drop if extraction == .
drop if extraction < 0
drop if extraction > 1000
drop if extraction >250 & quarter ==1
drop if acres_tot<1
drop if gw_depth <0


gen lnpump = log(extraction)
gen hsPump = asinh(extraction)
gen lnpump_plus = log(extraction +1)
gen lnprice = log(price)
gen lndelivered= log(delivered_water + 1)
gen lndepth = log(gw_depth)
gen lnCLzd = log(CL_zonedate)
label variable lnCLzd "chloride-quarterly, regional"

gen date = yq(year, quarter)
format date %tq
label variable date "Date"
xtset sitenum date, quarterly 

*Set cluster variable
egen zone_year = group(inside year)
egen time=group(year quarter)
gen time2=time^2
gen time3=time^3
gen time_zone= time*inside 
gen time_county = time*county
egen quarter_zone=group(inside quarter)

*Gen diff in diff variables
gen post = 0
replace post = 1 if date>202
gen treatment = inside*post
label variable treatment "Inside*Post"
label variable post "Post-2010Q4"


gen price_lag1 = price[_n-1]
gen lnprice_lag = ln(price_lag1)



duplicates report sitenum date
duplicates tag sitenum date, gen(duplicate)
drop if duplicate >0
drop duplicate


preserve
drop if lndelivered==.
drop if lnCLzd==.
bys inside: sum extraction price
sum price if date > 202 & inside ==1
sum price if date < 203 & inside ==1
sum extraction if date <203
restore

drop if year<2005

*How many parcels in dataset? 
by parcelnum, sort: gen nvals_parcel = _n == 1 
preserve 
keep if inside==0
count if nvals_parcel
restore

*QUARTERLY DATASET
save "Data\Well_Clean_$outputdate.dta", replace

*Create annual data, where a 12 month period spans Q4 to Q3. Aligns with price change
*Drop first three quarters of 2005
drop if year<2005
drop if year==2005 & quarter!=4
sort year quarter
egen quarter_running=group(year quarter)

gen year_run=.
replace year_run=1 if quarter_running<5
replace year_run=2 if quarter_running>4 & quarter_running<9
replace year_run=3 if quarter_running>8 & quarter_running<13
replace year_run=4 if quarter_running>12 & quarter_running<17
replace year_run=5 if quarter_running>16 & quarter_running<21
replace year_run=6 if quarter_running>20 & quarter_running<25
replace year_run=7 if quarter_running>24 & quarter_running<29
replace year_run=8 if quarter_running>28 & quarter_running<33
replace year_run=9 if quarter_running>32 & quarter_running<37
replace year_run=10 if quarter_running>36 & quarter_running<41
replace year_run=11 if quarter_running>40 & quarter_running<45
replace year_run=12 if quarter_running>44 & quarter_running<49
replace year_run=13 if quarter_running>48 & quarter_running<53
replace year_run=14 if quarter_running>52 & quarter_running<57
replace year_run=15 if quarter_running>56 & quarter_running<61
drop if year_run==.

collapse (sum) extraction delivered (mean) parcel_size_fixed area_acres price CL_zonedate delivered_rate gw_depth rent_irrigated flat_rate time_of_use, by (sitenum parcelnum year_run inside county_code)
xtset sitenum year_run

*Generate treatment variable
gen post = 0
replace post = 1 if year_run>5
gen treatment = inside*post
label variable treatment "Inside*Post"
label variable post "Post-2010Q4"
gen extract_acre = extraction/area_acres

label variable extraction "Extraction (AF)"
label variable price "Pumping price"
label variable delivered "Recycled deliveries"
label variable delivered_rate "Delivered rate"
label variable gw_depth "Depth to water table"
label variable CL_zonedate "Chloride"
label variable rent_irrigated "Cropland Rental Rate"
label variable time_of_use "Electricity (time of use) rate"
label variable flat_rate_1B "Electricity (flat) rate"

*ANNUAL DATASET
save "Data\Well_clean_yearrun_$outputdate.dta", replace

